Error in install.packages : Updating loaded packages
Error in install.packages : Updating loaded packages
Error in install.packages : Updating loaded packages
library(plyr)
library(dplyr)
library(ggplot2)
# Read the three "usages" CSV files and the first "quest.compil" CSV file,
# echoing each data frame right after it is loaded.
usage1 <- read.csv(file = "H:\\Downloads\\MOOC DATA\\usages.effec1.csv")
usage1
usage2 <- read.csv(file = "H:\\Downloads\\MOOC DATA\\usages.effec2.csv")
usage2
usage3 <- read.csv(file = "H:\\Downloads\\MOOC DATA\\usages.effec3.csv")
usage3
effecquest1 <- read.csv(
  file = "H:\\Downloads\\MOOC DATA\\effec1.quest.compil.csv"
)
effecquest1
# Install naniar only when it is not available yet: install.packages() refuses
# to update a package that is currently loaded ("is in use and will not be
# installed"), so the unconditional call only produced warnings.
if (!requireNamespace("naniar", quietly = TRUE)) {
  install.packages("naniar")
}
WARNING: Rtools is required to build R packages but is not currently installed. Please download and install the appropriate version of Rtools before proceeding:
https://cran.rstudio.com/bin/windows/Rtools/
Warning in install.packages :
package ‘naniar’ is in use and will not be installed
# Install plyr only when it is not available yet: install.packages() refuses
# to update a package that is currently loaded ("is in use and will not be
# installed"), so the unconditional call only produced warnings.
if (!requireNamespace("plyr", quietly = TRUE)) {
  install.packages("plyr")
}
WARNING: Rtools is required to build R packages but is not currently installed. Please download and install the appropriate version of Rtools before proceeding:
https://cran.rstudio.com/bin/windows/Rtools/
Warning in install.packages :
package ‘plyr’ is in use and will not be installed
# Install dplyr only when it is not available yet: install.packages() refuses
# to update a package that is currently loaded ("is in use and will not be
# installed"), so the unconditional call only produced warnings.
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}
WARNING: Rtools is required to build R packages but is not currently installed. Please download and install the appropriate version of Rtools before proceeding:
https://cran.rstudio.com/bin/windows/Rtools/
Warning in install.packages :
package ‘dplyr’ is in use and will not be installed
# Read the second and third "quest.compil" CSV files and echo them.
# Only the second file is read with an explicit ISO-8859-1 encoding.
effecquest2 <- read.csv(
  file = "H:\\Downloads\\MOOC DATA\\effec2.quest.compil.csv",
  fileEncoding = "ISO-8859-1"
)
effecquest2
effecquest3 <- read.csv(
  file = "H:\\Downloads\\MOOC DATA\\effec3.quest.compil.csv"
)
effecquest3
# Reduce each questionnaire table to the columns kept for the merge below.
# NOTE(review): the first questionnaire keeps only Student_ID, whereas the
# other two also keep Country, Formation, Diploma and Gender - confirm this
# difference is intended.
effecquest1 <- effecquest1 %>%
  select("Student_ID")
effecquest1
effecquest2 <- effecquest2 %>%
  select("Country", "Formation", "Diploma", "Student_ID", "Gender")
effecquest2
effecquest3 <- effecquest3 %>%
  select("Country", "Formation", "Diploma", "Student_ID", "Gender")
effecquest3
# Join each usage table with its questionnaire table on Student_ID.
# merge() is called with its defaults, so only IDs present in both inputs
# are kept. The merged tables are echoed afterwards.
datamerge1 <- merge(x = usage1, y = effecquest1, by = "Student_ID")
datamerge2 <- merge(x = usage2, y = effecquest2, by = "Student_ID")
datamerge3 <- merge(x = usage3, y = effecquest3, by = "Student_ID")
datamerge1
datamerge2
datamerge3
# Tag every merged table with its iteration number, then stack the three
# tables; rbind.fill() fills columns missing from a table with NA.
datamerge1 <- mutate(datamerge1, iteration = 1)
datamerge2 <- mutate(datamerge2, iteration = 2)
datamerge3 <- mutate(datamerge3, iteration = 3)
finaldata <- rbind.fill(list(datamerge1, datamerge2, datamerge3))
finaldata
colnames(finaldata)
[1] "Student_ID" "Exam.score" "Exam.bin" "Assignment.score" "Assignment.bin"
[6] "Quizz.1.score" "Quizz.1.bin" "Quizz.2.score" "Quizz.2.bin" "Quizz.3.score"
[11] "Quizz.3.bin" "Quizz.4.bin" "Quizz.4.score" "Quizz.5.bin" "Quizz.5.score"
[16] "Intro.MOOC" "Prez.sem.1" "S1.L1" "S1.L2" "S1.L3"
[21] "S1.L4" "S1.L5" "S1.L6" "Prez.sem.2" "S2.L1"
[26] "S2.L2" "S2.L3" "S2.L4" "S2.L5" "S2.L6"
[31] "Prez.sem.3" "S3.L1.1" "S3.L1.2" "S3.L2" "S3.L3"
[36] "S3.L4" "S3.L5" "Prez.sem.4" "S4.L1.1" "S4.L1.2"
[41] "S4.L2" "S4.L3" "S4.L4" "S4.L5" "Prez.sem.5"
[46] "S5.L1.1" "S5.L1.2" "S5.L2" "S5.L3" "S5.L4"
[51] "S5.L5" "Post.forum.0" "view.forum.0" "Post.forum.1" "Post.forum.1.2"
[56] "view.forum.1" "view.forum.1.2" "Post.forum.2" "Post.forum.2.2" "view.forum.2"
[61] "view.forum.2.2" "Post.forum.3" "view.forum.3" "Post.forum.4" "Post.forum.4.2"
[66] "view.forum.4" "view.forum.4.2" "Post.forum.5" "Post.forum.5.2" "view.forum.5"
[71] "view.forum.5.2" "last.video" "last.quizz" "iteration" "Assignment.choice"
[76] "Country" "Formation" "Diploma" "Gender" "Post.forum.fonc.cours"
[81] "view.forum.fonc.cours"
library(dplyr)
# Assign one Category per learner from the activity flags.
# Rules are checked top to bottom and the first match wins:
#   1. Exam.bin == 0 and the assignment or any quiz is 1 -> "Disengaging Learner"
#   2. Exam.bin == 1 or Assignment.bin == 1              -> "Completer"
#   3. exam, assignment and all quizzes are 0,
#      last.video > 10                                   -> "Auditing Learner"
#   4. same as 3 but last.video <= 10                    -> "Bystander"
# Rows matching no rule get Category = NA.
#
# This single case_when() replaces a chain of ifelse() overwrites. For inputs
# without NA the result is the same (the order above reproduces the old
# "last overwrite wins" outcome). The difference: case_when() treats an NA
# condition as "no match", whereas a later ifelse() with an NA condition reset
# an already-assigned Category to NA (e.g. Exam.bin missing while
# Assignment.bin == 1 now stays "Completer").
#
# NOTE(review): because rule 1 has priority, Exam.bin == 0 with
# Assignment.bin == 1 is labelled "Disengaging Learner" (as before), so the
# Assignment.bin part of rule 2 only matters when Exam.bin is not 0 - confirm
# against the intended category definitions.
finaldata <- finaldata %>%
  mutate(
    Category = case_when(
      Exam.bin == 0 & (Assignment.bin == 1 |
        Quizz.1.bin == 1 |
        Quizz.2.bin == 1 |
        Quizz.3.bin == 1 |
        Quizz.4.bin == 1 |
        Quizz.5.bin == 1) ~ "Disengaging Learner",
      Exam.bin == 1 | Assignment.bin == 1 ~ "Completer",
      Exam.bin == 0 & Assignment.bin == 0 &
        Quizz.1.bin == 0 &
        Quizz.2.bin == 0 &
        Quizz.3.bin == 0 &
        Quizz.4.bin == 0 &
        Quizz.5.bin == 0 &
        last.video > 10 ~ "Auditing Learner",
      Exam.bin == 0 & Assignment.bin == 0 &
        Quizz.1.bin == 0 &
        Quizz.2.bin == 0 &
        Quizz.3.bin == 0 &
        Quizz.4.bin == 0 &
        Quizz.5.bin == 0 &
        last.video <= 10 ~ "Bystander",
      TRUE ~ NA_character_
    )
  )
print(finaldata)
# Count learners per (iteration, Category) pair.
# .groups = "drop" returns an ungrouped result directly, which removes the
# "summarise() has grouped output by 'iteration'" message and makes the
# separate ungroup() step unnecessary.
data_summary <- finaldata %>%
  group_by(iteration, Category) %>%
  dplyr::summarize(count = n(), .groups = "drop")
`summarise()` has grouped output by 'iteration'. You can override using the `.groups` argument.
# Stacked bar chart: share of each learner category within every iteration.
# position = "fill" rescales each bar to 100 %, so the y axis is a proportion.
ggplot(
  data = data_summary,
  mapping = aes(x = factor(iteration), y = count, fill = Category)
) +
  geom_col(position = "fill") +
  scale_y_continuous(labels = scales::percent_format()) +
  labs(
    title = "Percentage of Student Status by Iteration",
    x = "Iteration",
    y = "Percentage"
  ) +
  theme_minimal()
Error in install.packages : Updating loaded packages
library(vcd)
# Mosaic plot of learner category against iteration, with shading enabled;
# the first variable is split vertically and the second horizontally.
mosaic(
  ~ iteration + Category,
  data = finaldata,
  main = "Mosaic Plot of Learner Categories by Iteration",
  xlab = "Iteration",
  ylab = "Category",
  direction = c("v", "h"),
  shade = TRUE
)
# Start of the PhD dataset analysis: identifying errors in the data.
# Load the PhD dataset. Only empty fields are read as NA (na.strings = "").
# header uses TRUE instead of T, because T is an ordinary variable that can be
# reassigned.
# NOTE(review): with na.strings = "" the literal strings "NA" and "na" stay
# ordinary text (str() below shows "na" in Identifiant.directeur and "NA" in
# Langue_rec), so those columns report 0 % missing - confirm whether they
# should be added to na.strings.
Phd_data <- read.csv(
  "H:\\Downloads\\PhD.dataset.csv",
  header = TRUE,
  na.strings = ""
)
Phd_data
summary(Phd_data)
X Auteur Identifiant.auteur Titre Directeur.de.these
Min. : 0 Length:448047 Length:448047 Length:448047 Length:448047
1st Qu.:112012 Class :character Class :character Class :character Class :character
Median :224023 Mode :character Mode :character Mode :character Mode :character
Mean :224023
3rd Qu.:336035
Max. :448046
Directeur.de.these..nom.prenom. Identifiant.directeur Etablissement.de.soutenance Identifiant.etablissement
Length:448047 Length:448047 Length:448047 Length:448047
Class :character Class :character Class :character Class :character
Mode :character Mode :character Mode :character Mode :character
Discipline Statut Date.de.premiere.inscription.en.doctorat Date.de.soutenance Year
Length:448047 Length:448047 Length:448047 Length:448047 Min. :1971
Class :character Class :character Class :character Class :character 1st Qu.:1994
Mode :character Mode :character Mode :character Mode :character Median :2004
Mean :2003
3rd Qu.:2012
Max. :2020
NA's :57086
Langue.de.la.these Identifiant.de.la.these Accessible.en.ligne Publication.dans.theses.fr Mise.a.jour.dans.theses.fr
Length:448047 Length:448047 Length:448047 Length:448047 Length:448047
Class :character Class :character Class :character Class :character Class :character
Mode :character Mode :character Mode :character Mode :character Mode :character
Discipline_prÃ.di Genre etablissement_rec Langue_rec
Length:448047 Length:448047 Length:448047 Length:448047
Class :character Class :character Class :character Class :character
Mode :character Mode :character Mode :character Mode :character
# Compact overview of the PhD data: dimensions, column types and first values.
str(Phd_data)
'data.frame': 448047 obs. of 23 variables:
$ X : int 0 1 2 3 4 5 6 7 8 9 ...
$ Auteur : chr "Saeed Al marri" "Andrea Ramazzotti" "OLIVIER BODENREIDER" "Emmanuel Porte" ...
$ Identifiant.auteur : chr NA "174423705" NA NA ...
$ Titre : chr "Le credit documentaire et l'onopposabilite des exceptions" "Application de la PGD a la resolution de problemes transitoires couples en vue de l'allegement des structures composites." "Conception d'un outil informatique d'etude des cinetiques observees en toxicologie clinique" "Socio-histoire des politiques publiques en matiere sociale concernant les etudiants." ...
$ Directeur.de.these : chr "Philippe Delebecque" "Jean-Claude Grandidier,Marianne Beringhier" "Francois Kohler" "Gilles Pollet" ...
$ Directeur.de.these..nom.prenom. : chr "Delebecque Philippe" "Grandidier Jean-Claude,Beringhier Marianne" "Kohler Francois" "Pollet Gilles" ...
$ Identifiant.directeur : chr "29561248" "715,441,511" "57030758" "na" ...
$ Etablissement.de.soutenance : chr "Paris 1" "Chasseneuil-du-Poitou, Ecole nationale superieure de mecanique et d'aerotechnique" "Nancy 1" "Lyon 2" ...
$ Identifiant.etablissement : chr "27361802" "28024400" NA "02640334X" ...
$ Discipline : chr "Driot prive" "Mecanique des solides, des materiaux, des structures et des surfaces" "Medecine" "Science politique" ...
$ Statut : chr "enCours" "enCours" "soutenue" "enCours" ...
$ Date.de.premiere.inscription.en.doctorat: chr "30-09-11" "01-10-12" NA "01-06-11" ...
$ Date.de.soutenance : chr NA NA "01-01-93" NA ...
$ Year : num NA NA 1993 NA NA ...
$ Langue.de.la.these : chr "na" "na" "fr" "na" ...
$ Identifiant.de.la.these : chr "s69480" "s98826" "1993NAN19006" "s88867" ...
$ Accessible.en.ligne : chr "non" "non" "non" "non" ...
$ Publication.dans.theses.fr : chr "26-01-12" "22-11-13" "24-05-13" "12-07-13" ...
$ Mise.a.jour.dans.theses.fr : chr "26-01-12" "22-11-13" "17-11-12" "12-01-16" ...
$ Discipline_prÃ.di : chr "Droit et Science Politique" "Materiaux, Milieux et Chimie" "Medecine" "Droit et Science Politique" ...
$ Genre : chr "male" "female" "male" "male" ...
$ etablissement_rec : chr "Université Paris 1 - Panthéon Sorbonne" "École nationale supérieure de mécanique et d'aérotechnique de Poitiers" "Université de Lorraine" "Université Lumière - Lyon 2" ...
$ Langue_rec : chr "NA" "NA" "Français" "NA" ...
# List the column names of the PhD data.
colnames(Phd_data)
[1] "X" "Auteur"
[3] "Identifiant.auteur" "Titre"
[5] "Directeur.de.these" "Directeur.de.these..nom.prenom."
[7] "Identifiant.directeur" "Etablissement.de.soutenance"
[9] "Identifiant.etablissement" "Discipline"
[11] "Statut" "Date.de.premiere.inscription.en.doctorat"
[13] "Date.de.soutenance" "Year"
[15] "Langue.de.la.these" "Identifiant.de.la.these"
[17] "Accessible.en.ligne" "Publication.dans.theses.fr"
[19] "Mise.a.jour.dans.theses.fr" "Discipline_prÃ.di"
[21] "Genre" "etablissement_rec"
[23] "Langue_rec"
library(naniar)
library(tidyverse)
library(dplyr)
# Percentage of NA values in each column of the PhD data (named numeric
# vector, one entry per column).
pct_na <- function(column) {
  sum(is.na(column)) / length(column) * 100
}
percentage_miss <- vapply(Phd_data, pct_na, numeric(1))
percentage_miss
X Auteur Identifiant.auteur
0.000000e+00 0.000000e+00 2.909226e+01
Titre Directeur.de.these Directeur.de.these..nom.prenom.
1.562336e-03 2.901481e-03 2.901481e-03
Identifiant.directeur Etablissement.de.soutenance Identifiant.etablissement
0.000000e+00 2.231909e-04 3.812546e+00
Discipline Statut Date.de.premiere.inscription.en.doctorat
0.000000e+00 0.000000e+00 8.564191e+01
Date.de.soutenance Year Langue.de.la.these
1.274107e+01 1.274107e+01 0.000000e+00
Identifiant.de.la.these Accessible.en.ligne Publication.dans.theses.fr
0.000000e+00 0.000000e+00 0.000000e+00
Mise.a.jour.dans.theses.fr Discipline_prÃ.di Genre
3.950478e-02 0.000000e+00 0.000000e+00
etablissement_rec Langue_rec
6.860887e-01 0.000000e+00
# Plot the missing-value pattern of the whole PhD data set; the large-data
# warning of vis_miss() is switched off via warn_large_data = FALSE.
vis_miss(Phd_data, warn_large_data = FALSE)
# Install vcd only when it is not available yet: install.packages() refuses
# to update a package that is currently loaded ("is in use and will not be
# installed"), so the unconditional call only produced warnings.
if (!requireNamespace("vcd", quietly = TRUE)) {
  install.packages("vcd")
}
WARNING: Rtools is required to build R packages but is not currently installed. Please download and install the appropriate version of Rtools before proceeding:
https://cran.rstudio.com/bin/windows/Rtools/
Warning in install.packages :
package ‘vcd’ is in use and will not be installed
# Presence indicators: 1 where a value is present, 0 where it is NA, for
# every column; then the pairwise correlation of those indicators.
Phd_data_binary <- Phd_data %>%
  mutate(across(everything(), function(col) as.numeric(!is.na(col))))
cor_matrix <- cor(x = Phd_data_binary)
Warning in cor(Phd_data_binary) : the standard deviation is zero
# Show the indicator correlations; NA cells belong to columns with zero
# standard deviation (see the cor() warning).
print(cor_matrix)
X Auteur Identifiant.auteur Titre Directeur.de.these
X 1 NA NA NA NA
Auteur NA 1 NA NA NA
Identifiant.auteur NA NA 1.000000000 6.170903e-03 3.848084e-03
Titre NA NA 0.006170903 1.000000e+00 1.048095e-01
Directeur.de.these NA NA 0.003848084 1.048095e-01 1.000000e+00
Directeur.de.these..nom.prenom. NA NA 0.003848084 1.048095e-01 1.000000e+00
Identifiant.directeur NA NA NA NA NA
Etablissement.de.soutenance NA NA -0.000956931 -5.905128e-06 -8.047387e-06
Identifiant.etablissement NA NA 0.198268925 -7.869352e-04 3.255071e-03
Discipline NA NA NA NA NA
Statut NA NA NA NA NA
Date.de.premiere.inscription.en.doctorat NA NA -0.637333183 -8.043235e-03 -1.576861e-04
Date.de.soutenance NA NA 0.593261540 8.650598e-03 -8.156306e-04
Year NA NA 0.593261540 8.650598e-03 -8.156306e-04
Langue.de.la.these NA NA NA NA NA
Identifiant.de.la.these NA NA NA NA NA
Accessible.en.ligne NA NA NA NA NA
Publication.dans.theses.fr NA NA NA NA NA
Mise.a.jour.dans.theses.fr NA NA -0.012733640 -7.857805e-05 -1.070846e-04
Discipline_prÃ.di NA NA NA NA NA
Genre NA NA NA NA NA
etablissement_rec NA NA 0.020580219 -3.285305e-04 -4.477146e-04
Langue_rec NA NA NA NA NA
Directeur.de.these..nom.prenom. Identifiant.directeur Etablissement.de.soutenance
X NA NA NA
Auteur NA NA NA
Identifiant.auteur 3.848084e-03 NA -9.569310e-04
Titre 1.048095e-01 NA -5.905128e-06
Directeur.de.these 1.000000e+00 NA -8.047387e-06
Directeur.de.these..nom.prenom. 1.000000e+00 NA -8.047387e-06
Identifiant.directeur NA 1 NA
Etablissement.de.soutenance -8.047387e-06 NA 1.000000e+00
Identifiant.etablissement 3.255071e-03 NA -2.974316e-04
Discipline NA NA NA
Statut NA NA NA
Date.de.premiere.inscription.en.doctorat -1.576861e-04 NA 6.117076e-04
Date.de.soutenance -8.156306e-04 NA -5.708694e-04
Year -8.156306e-04 NA -5.708694e-04
Langue.de.la.these NA NA NA
Identifiant.de.la.these NA NA NA
Accessible.en.ligne NA NA NA
Publication.dans.theses.fr NA NA NA
Mise.a.jour.dans.theses.fr -1.070846e-04 NA -2.969951e-05
Discipline_prÃ.di NA NA NA
Genre NA NA NA
etablissement_rec -4.477146e-04 NA -1.241720e-04
Langue_rec NA NA NA
Identifiant.etablissement Discipline Statut
X NA NA NA
Auteur NA NA NA
Identifiant.auteur 0.1982689250 NA NA
Titre -0.0007869352 NA NA
Directeur.de.these 0.0032550706 NA NA
Directeur.de.these..nom.prenom. 0.0032550706 NA NA
Identifiant.directeur NA NA NA
Etablissement.de.soutenance -0.0002974316 NA NA
Identifiant.etablissement 1.0000000000 NA NA
Discipline NA 1 NA
Statut NA NA 1
Date.de.premiere.inscription.en.doctorat 0.0814847706 NA NA
Date.de.soutenance -0.0759010138 NA NA
Year -0.0759010138 NA NA
Langue.de.la.these NA NA NA
Identifiant.de.la.these NA NA NA
Accessible.en.ligne NA NA NA
Publication.dans.theses.fr NA NA NA
Mise.a.jour.dans.theses.fr -0.0039578470 NA NA
Discipline_prÃ.di NA NA NA
Genre NA NA NA
etablissement_rec -0.0135824790 NA NA
Langue_rec NA NA NA
Date.de.premiere.inscription.en.doctorat Date.de.soutenance Year
X NA NA NA
Auteur NA NA NA
Identifiant.auteur -0.6373331827 0.5932615405 0.5932615405
Titre -0.0080432352 0.0086505981 0.0086505981
Directeur.de.these -0.0001576861 -0.0008156306 -0.0008156306
Directeur.de.these..nom.prenom. -0.0001576861 -0.0008156306 -0.0008156306
Identifiant.directeur NA NA NA
Etablissement.de.soutenance 0.0006117076 -0.0005708694 -0.0005708694
Identifiant.etablissement 0.0814847706 -0.0759010138 -0.0759010138
Discipline NA NA NA
Statut NA NA NA
Date.de.premiere.inscription.en.doctorat 1.0000000000 -0.9286577307 -0.9286577307
Date.de.soutenance -0.9286577307 1.0000000000 1.0000000000
Year -0.9286577307 1.0000000000 1.0000000000
Langue.de.la.these NA NA NA
Identifiant.de.la.these NA NA NA
Accessible.en.ligne NA NA NA
Publication.dans.theses.fr NA NA NA
Mise.a.jour.dans.theses.fr 0.0081398395 -0.0075964159 -0.0075964159
Discipline_prÃ.di NA NA NA
Genre NA NA NA
etablissement_rec 0.0084329391 -0.0050001283 -0.0050001283
Langue_rec NA NA NA
Langue.de.la.these Identifiant.de.la.these Accessible.en.ligne
X NA NA NA
Auteur NA NA NA
Identifiant.auteur NA NA NA
Titre NA NA NA
Directeur.de.these NA NA NA
Directeur.de.these..nom.prenom. NA NA NA
Identifiant.directeur NA NA NA
Etablissement.de.soutenance NA NA NA
Identifiant.etablissement NA NA NA
Discipline NA NA NA
Statut NA NA NA
Date.de.premiere.inscription.en.doctorat NA NA NA
Date.de.soutenance NA NA NA
Year NA NA NA
Langue.de.la.these 1 NA NA
Identifiant.de.la.these NA 1 NA
Accessible.en.ligne NA NA 1
Publication.dans.theses.fr NA NA NA
Mise.a.jour.dans.theses.fr NA NA NA
Discipline_prÃ.di NA NA NA
Genre NA NA NA
etablissement_rec NA NA NA
Langue_rec NA NA NA
Publication.dans.theses.fr Mise.a.jour.dans.theses.fr Discipline_prÃ.di Genre
X NA NA NA NA
Auteur NA NA NA NA
Identifiant.auteur NA -1.273364e-02 NA NA
Titre NA -7.857805e-05 NA NA
Directeur.de.these NA -1.070846e-04 NA NA
Directeur.de.these..nom.prenom. NA -1.070846e-04 NA NA
Identifiant.directeur NA NA NA NA
Etablissement.de.soutenance NA -2.969951e-05 NA NA
Identifiant.etablissement NA -3.957847e-03 NA NA
Discipline NA NA NA NA
Statut NA NA NA NA
Date.de.premiere.inscription.en.doctorat NA 8.139840e-03 NA NA
Date.de.soutenance NA -7.596416e-03 NA NA
Year NA -7.596416e-03 NA NA
Langue.de.la.these NA NA NA NA
Identifiant.de.la.these NA NA NA NA
Accessible.en.ligne NA NA NA NA
Publication.dans.theses.fr 1 NA NA NA
Mise.a.jour.dans.theses.fr NA 1.000000e+00 NA NA
Discipline_prÃ.di NA NA 1 NA
Genre NA NA NA 1
etablissement_rec NA -1.652326e-03 NA NA
Langue_rec NA NA NA NA
etablissement_rec Langue_rec
X NA NA
Auteur NA NA
Identifiant.auteur 0.0205802186 NA
Titre -0.0003285305 NA
Directeur.de.these -0.0004477146 NA
Directeur.de.these..nom.prenom. -0.0004477146 NA
Identifiant.directeur NA NA
Etablissement.de.soutenance -0.0001241720 NA
Identifiant.etablissement -0.0135824790 NA
Discipline NA NA
Statut NA NA
Date.de.premiere.inscription.en.doctorat 0.0084329391 NA
Date.de.soutenance -0.0050001283 NA
Year -0.0050001283 NA
Langue.de.la.these NA NA
Identifiant.de.la.these NA NA
Accessible.en.ligne NA NA
Publication.dans.theses.fr NA NA
Mise.a.jour.dans.theses.fr -0.0016523260 NA
Discipline_prÃ.di NA NA
Genre NA NA
etablissement_rec 1.0000000000 NA
Langue_rec NA 1
library(corrgram)
# UpSet plot of co-occurring missing values (naniar defaults).
gg_miss_upset(Phd_data)
# Logical data frame: TRUE where the corresponding cell of Phd_data is NA.
missing_data <- Phd_data %>%
  is.na() %>%
  as.data.frame()
library(corrgram)
# The corrgram is titled "Missing Value Indicators", so it must be drawn from
# the indicators, not from the raw Phd_data (which was passed before, leaving
# missing_data unused).
# Columns that are never or always missing have zero variance; their
# correlations are NA, so they are left out before plotting.
has_variation <- vapply(
  missing_data,
  function(flags) any(flags) && !all(flags),
  logical(1)
)
# Convert the remaining TRUE/FALSE flags to 1/0 so the columns are numeric.
missing_numeric <- missing_data[has_variation]
missing_numeric[] <- lapply(missing_numeric, as.numeric)
corrgram(
  missing_numeric,
  order = TRUE,
  lower.panel = panel.shade,
  upper.panel = panel.pie,
  text.panel = panel.txt,
  main = "Corrgram of Missing Value Indicators"
)
library(corrplot)
library(dplyr)
# Missingness indicators: 1 where a value is NA, 0 otherwise, for every
# column; then the pairwise correlation of those indicators.
na_indicator <- Phd_data %>%
  mutate(across(everything(), function(col) as.numeric(is.na(col))))
na_cor_matrix <- cor(x = na_indicator)
Warning in cor(na_indicator) : the standard deviation is zero
# Colour heatmap (lower triangle) of the missingness-indicator correlations.
corrplot(
  na_cor_matrix,
  method = "color",
  type = "lower",
  is.corr = FALSE,
  col = colorRampPalette(c("blue", "white", "red"))(200),
  addCoef.col = NULL,
  tl.col = "black",
  tl.srt = 45,    # rotate the labels by 45 degrees for readability
  tl.cex = 0.6,   # shrink the label text
  na.label = " ", # blank label for NA cells instead of the default marker
  title = "Heatmap of Missing Value Indicators",
  mar = c(0, 0, 2, 0)
)
# UpSet plot of co-occurring missing values, with nsets and nintersects both
# set to 10.
gg_miss_upset(Phd_data, nsets = 10, nintersects = 10)
# Percentage of missing values in each column.
vapply(
  Phd_data,
  function(column) sum(is.na(column)) / length(column) * 100,
  numeric(1)
)
X Auteur Identifiant.auteur
0.000000e+00 0.000000e+00 2.909226e+01
Titre Directeur.de.these Directeur.de.these..nom.prenom.
1.562336e-03 2.901481e-03 2.901481e-03
Identifiant.directeur Etablissement.de.soutenance Identifiant.etablissement
0.000000e+00 2.231909e-04 3.812546e+00
Discipline Statut Date.de.premiere.inscription.en.doctorat
0.000000e+00 0.000000e+00 8.564191e+01
Date.de.soutenance Year Langue.de.la.these
1.274107e+01 1.274107e+01 0.000000e+00
Identifiant.de.la.these Accessible.en.ligne Publication.dans.theses.fr
0.000000e+00 0.000000e+00 0.000000e+00
Mise.a.jour.dans.theses.fr Discipline_prÃ.di Genre
3.950478e-02 0.000000e+00 0.000000e+00
etablissement_rec Langue_rec
6.860887e-01 0.000000e+00
# Bar chart counting the rows of the PhD data per Year value.
ggplot(data = Phd_data, mapping = aes(x = Year)) +
  geom_bar() +
  labs(
    title = "The num of PHD Defenses by Year",
    x = "Year",
    y = "Count"
  ) +
  theme_minimal()
Warning: Removed 57086 rows containing non-finite outside the scale range (`stat_count()`).
# Sanity check: list any rows whose Year is later than 2020 (none expected).
filter(Phd_data, Year > 2020)